#Objective: To analyze the differential expression between individuals who develop MPNST compared to individuals who don’t (pNF)
### load files
load("../data/salmon.result.metadata.Rda")
syn.query <- salmon.result[salmon.result$specimenID !=
"2-025 Neurofibroma", ]
metadata.df <- syn.query[, colSums(is.na(syn.query)) <
nrow(syn.query)]
cols.keep <- c("specimenID", "id", "individualID",
"sex", "tumorType", "isCellLine", "isPrimaryCell",
"tissue", "experimentalCondition", "consortium",
"study", "transplantationType")
metadata.df <- metadata.df[, cols.keep]
rownames(metadata.df) <- metadata.df$specimenIDFrom the transcript counts step to make sure no unprocessed transcripts, etc are included in the counts. Also only includes genes with gene symbols.
load("../data/prot_coding_counts_expData.Rda")
data.mat = reshape2::acast(expData, Symbol ~ specimenID,
value.var = "roundedCounts")
missing = which(apply(data.mat, 1, function(x) any(is.na(x))))
if (length(missing) > 0) data.mat = data.mat[-missing,
]
### manually change name of 2-004 specID
colnames(data.mat)[8] <- "2-004 Plexiform Neurofibroma"
### reorder data.mat
data.mat <- data.mat[, rownames(metadata.df)]### filter for pNFs that progress/dont to MPNST
# have pNF, have MPNST
pNF_progress <- (metadata.df %>% filter(individualID %in%
c("2-002", "2-003", "2-009", "2-013", "2-015",
"2-016", "2-023", "2-031") & tumorType == "Plexiform Neurofibroma"))$specimenID
# have pNF, don't have MPNST
pNF_noProgress <- (metadata.df %>% filter(!(individualID %in%
c("2-002", "2-003", "2-009", "2-013", "2-015",
"2-016", "2-023", "2-031")) & tumorType ==
"Plexiform Neurofibroma"))$specimenID
### just pNF MPNST
metadata.df_filtered <- metadata.df %>% filter(tumorType ==
"Plexiform Neurofibroma")
### add progress to MPNST col
metadata.df_filtered$MPNST_progression <- "progress"
metadata.df_filtered[which(metadata.df_filtered$specimenID %in%
pNF_progress), ]$MPNST_progression <- "progress"
metadata.df_filtered[which(metadata.df_filtered$specimenID %in%
pNF_noProgress), ]$MPNST_progression <- "no_progress"
### shorten words and remove spaces for downstream
metadata.df_filtered$tumorType <- gsub("Plexiform Neurofibroma",
"pNF", metadata.df_filtered$tumorType)
filtered_ids <- metadata.df_filtered$specimenID
data.mat <- data.mat[, filtered_ids]
metadata.df_filtered$specimenID <- gsub(" Plexiform Neurofibroma",
"pNF", metadata.df_filtered$specimenID)
### change colname in txi to match
colnames(data.mat) <- metadata.df_filtered$specimenID
rownames(metadata.df_filtered) <- metadata.df_filtered$specimenIDfilter out counts < 10 and set alpha to .05
# detach('package:synapser', unload=TRUE)
# unloadNamespace('PythonEmbedInR')
dds <- DESeqDataSetFromMatrix(countData = data.mat,
colData = metadata.df_filtered, design = ~MPNST_progression)
### filter out reads with low counts
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)
res <- results(dds, alpha = 0.05)
resultsNames(dds)## [1] "Intercept"
## [2] "MPNST_progression_progress_vs_no_progress"
resLFC <- lfcShrink(dds, coef = 2, type = "apeglm") # plot looks weird with this type
# resLFC <- lfcShrink(dds, coef = 3, type='normal')## log2 fold change (MLE): MPNST progression progress vs no progress
## Wald test p-value: MPNST progression progress vs no progress
## DataFrame with 17134 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## A2M 192620.141165638 1.2455849772124 1.37304432686945
## A2ML1 99.0484277047366 -1.47466203827885 1.17525586293471
## A3GALT2 7.78461394662319 0.258387312214684 1.53382824232674
## A4GALT 1721.88443067838 -0.400371382201492 0.516065131970599
## A4GNT 1.30935703259983 -1.01842624308863 2.06407610591747
## ... ... ... ...
## ZXDC 1140.88565560739 0.13446750582303 0.239422567321793
## ZYG11A 77.8995768418035 -3.78847839867316 1.392185152915
## ZYG11B 3655.22169560955 -0.150338630687026 0.27809965996628
## ZYX 8771.76995776608 0.423025104653367 0.460148987732322
## ZZEF1 2964.62897420378 0.359348058194729 0.435090664921054
## stat pvalue padj
## <numeric> <numeric> <numeric>
## A2M 0.907170258699759 0.364316768757964 0.762692129227961
## A2ML1 -1.25475829118308 0.209566517838882 0.659690701578338
## A3GALT2 0.168459091496922 0.866222123952563 0.973165064688762
## A4GALT -0.775815604268148 0.437857869132013 0.808646094365473
## A4GNT -0.493405374040675 0.621726185920339 0.900126265073314
## ... ... ... ...
## ZXDC 0.561632545031983 0.574366401837014 0.879062425858905
## ZYG11A -2.72124608622692 0.00650363266360434 0.307430821223801
## ZYG11B -0.540592644756398 0.588788388686632 0.884581657410083
## ZYX 0.919322036843095 0.35792715468426 0.758745150347109
## ZZEF1 0.82591534860885 0.40885211322699 0.790269538420699
##
## out of 17134 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 33, 0.19%
## LFC < 0 (down) : 45, 0.26%
## outliers [1] : 518, 3%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
The LFC is of the MPNST progress / no progress
Dispersion is a metric for how much the variance differs from the mean in a negative binomial distribution.
logFC shrink
res_df = as.data.frame(dplyr::mutate(as.data.frame(res),
sig = ifelse(res$padj < 0.05, "FDR<0.05", "Not Sig")),
row.names = rownames(res))
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# mart <- useMart(biomart = 'ENSEMBL_MART_ENSEMBL',
# dataset = 'hsapiens_gene_ensembl', host =
# 'uswest.ensembl.org')
write.csv(res_df, file = "pNF_v_MPNST_geneList.csv",
row.names = TRUE)normalized by variance stabilizing transformation and displaying the top 50 highest expressed genes
vsd <- vst(dds, blind = FALSE)
# rename gene
vsd_matrix <- SummarizedExperiment::assay(vsd)
select <- order(rowMeans(vsd_matrix), decreasing = TRUE)[1:50]
top_genes <- vsd_matrix[select, ]
top_genes = top_genes - rowMeans(top_genes) # Subtract the row means from each value to standardize
df <- as.data.frame(colData(dds)[, c("sex", "MPNST_progression")])
pheatmap(top_genes, cluster_rows = TRUE, show_rownames = TRUE,
cluster_cols = TRUE, annotation_col = df, color = rev(brewer.pal(11,
"Spectral"))) Just by highest expressed genes they don’t cluster as expected, there is a pNF with no MPST progression that doesn’t cluster with other no MPST progression individuals
rld <- rlog(dds, blind = F)
# exp_matrix <- SummarizedExperiment::assay(rld)
# ### top 100 select <- order(rowMeans(exp_matrix),
# decreasing=TRUE)[1:20] top100 <-
# exp_matrix[select,] annotation_data <-
# as.data.frame(colData(rld)['MPNST_progression'] )
# pheatmap(exp_matrix,
# annotation_col=annotation_data, color =
# rev(brewer.pal(11, 'Spectral')))
mat <- assay(vsd)
mat_df <- as.data.frame(mat)
mat_prot_df <- mat_df[head(order(res[rownames(res),
]$padj), 50), ] # select the top 50 genes with the lowest padj
mat_prot_df = mat_prot_df - rowMeans(mat_prot_df) # Subtract the row means from each value to standardize
## remove gene name rows
mat_prot_matrix <- as.matrix(mat_prot_df)
df = as.data.frame(colData(rld)[, c("sex", "MPNST_progression")]) # Create a dataframe with a column of the conditions
rownames(df) = colnames(mat) # add rownames
# and plot the actual heatmap
pheatmap(mat_prot_matrix, annotation_col = df, color = rev(brewer.pal(11,
"Spectral")))sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$specimenID,
vsd$MPNST_progression, sep = "-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists, col = colors,
show_rownames = TRUE)They don’t cluster by progression type
p <- plotPCA(vsd, intgroup = c("individualID", "MPNST_progression",
"sex"), returnData = TRUE)
percentVar <- round(100 * attr(p, "percentVar"))
ggplot(p, aes(PC1, PC2, color = individualID, shape = sex,
label = vsd$MPNST_progression)) + geom_point(size = 3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() + ggrepel::geom_text_repel()### https://www.biostars.org/p/282295/
par(mar = c(5, 5, 5, 5), cex = 1, cex.main = 1.4, cex.axis = 1.4,
cex.lab = 1.4)
topT <- as.data.frame(res)
# Adjusted P values (FDR Q values)
with(topT, plot(log2FoldChange, -log10(padj), pch = 20,
main = "Volcano plot", cex = 1, xlab = bquote(~Log[2] ~
fold ~ change), ylab = bquote(~-log[10] ~ p ~
adj)))
with(subset(topT, padj < 0.05 & abs(log2FoldChange) >
2), points(log2FoldChange, -log10(padj), pch = 20,
col = "red", cex = 1))
# Add lines for absolute FC>2 and P-value cut-off
# at FDR Q<0.05
abline(v = 0, col = "black", lty = 3, lwd = 1)
abline(v = -2, col = "black", lty = 4, lwd = 2)
abline(v = 2, col = "black", lty = 4, lwd = 2)
abline(h = -log10(max(topT$pvalue[topT$padj < 0.05],
na.rm = TRUE)), col = "black", lty = 4, lwd = 2)# https://shiring.github.io/rna-seq/deseq2/teaching/2016/09/29/DESeq2-course
# get entrezid
entrez_list <- getBM(filters = "hgnc_symbol", attributes = c("hgnc_symbol",
"entrezgene_id"), values = rownames(mat_df), mart = mart)
mat_df$hgnc_symbol <- rownames(mat_df)
mat_df <- inner_join(mat_df, entrez_list, by = "hgnc_symbol")This is the GO profile using a biological processes subontology
OrgDb <- org.Hs.eg.db # can also be other organisms
gene <- na.omit(mat_df$entrezgene)
# Group GO
ggo <- clusterProfiler::groupGO(gene = as.character(gene),
OrgDb = OrgDb, ont = "BP", level = 3, readable = TRUE)## Loading required package: DOSE
## DOSE v3.8.2 For help: https://guangchuangyu.github.io/DOSE
##
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
Using a p value and q value of .05, this returns the enrichment GO categories. Shown are the top 6
# GO over-representation test
ego <- clusterProfiler::enrichGO(gene = gene, OrgDb = OrgDb,
ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05,
qvalueCutoff = 0.05, readable = TRUE)
DT::datatable(head(summary(ego)[, -8]), rownames = FALSE)## Warning in summary(ego): summary method to convert the object to data.frame
## is deprecated, please use as.data.frame instead.
###KEGG pathways